
---
title: "Data cleaning of Stata dataset (produced by Gerda Asmus' code) and summary statistics"
author: "Pascal Geldsetzer"
date: "9/11/2017"
output: html_document
---

Latest update: 2018 - 02 - 15

Description: 
Run this R code before running the Stata do-file "3ie analysis_2018-02-18". This R code takes a partially cleaned Stata dataset (named by me "3ieforcleaning_2018-02-09.dta" - this is produced by the Stata do-files in the folder "Stata" under "Data cleaning"), cleans it further, does some summary stats, and then exports it to a Stata dataset for further analysis. Specifically, this code produces three datasets for analysis: 3ieforanalysis_2018-02-18.dta (for the analysis of the viral load failure endpoint), endlinecostdat_2018-02-12.dta for analysis of the secondary endpoint of healthcare expenditures, and baselinecostdat_2018-02-12.dta (to compare healthcare expenditures at baseline).


```{r load packages}
library(tidyverse) 
library(dplyr) 
library(haven)
library(forcats) # for categorical variables (R for data science rec) --> see https://rdrr.io/cran/forcats/man/fct_unify.html
library(stringr) # for manipulating string variables (R for data science rec)
library(lubridate) # for dates and times (R for data science rec)
#instalibrary(dummies) # to easily create dummies
library(reshape2) #useful for some efficient reshaping
library(ggplot2) 
library(ggrepel) # to avoid text labels in ggplot from overlapping
library(modelr) # to use "add_predictions()" for adding a column of predicted vals to your dataset
library(broom) # to create tidy data from model output
#library(margins) # R equivalent of Stata's margins command --> Thomas Leeper said this only to be used for marginal effects (not prediction)
#library(prediction) # Thomas Leeper's R package to get predicted probabilities
library(srvyr)  # survey package that also works with dplyr 
library(lmtest) # for likelihood ratio tests
#library(sandwich) # for robust standard errors 
#library(multiwayvcov) # for clustered standard errors
library(miceadds) # package to cluster SEs more easily than in multiwayvcov; it uses multiwayvcov, so the results between the two packages are exactly the same. 
library(speedglm)
library(foreign) # for importing non-csv datasets
library(readstata13) # foreign only reads Stata 12 or lower
#library(lme4) # for multi-level modeling
#library(lmerTest) # for p-values with the lmer command
#library(sjPlot) # for plotting lmer models
#library(texreg) # for tables
library(tableone) # Creates a table 1 (summary characteristics)
#library(mice) # md.pattern() function to see patterns of missing data 
#library(reshape) # to use the rescalar function

#library(car) # for easy attaching of new variables
#library(arm)
#library(mosaic)
#library(mosaicData)
library(mediation)  # for mediation analysis
#library(lattice)
#library(pander)

```



```{r Load data}
setwd("/Users/pgeldsetzer/Documents/aa SD/3ie ART home-delivery/Final report/Documentation/Final fully de-identified documentation shared with 3ie on April 30 2018/Raw data")
dat <- read_dta("Data for cleaning and summary stats_2018-04-30.dta")

```


```{r Filter by matches}
# ineligible matches for the analysis
ineligiblematches <- c("Baseline only", "Lab only", "Endline only", "Endline only (new participant)")

# Kick out all matching types without the logbook in it (as we define enrolment by the logbook)
dat <- dat %>% 
  mutate(matches = as.factor(matches)) %>% 
  filter(!(matches %in% ineligiblematches))

```


```{r clean facility and create intervention dummy}

# THIS IS NOW BASED ON THE FACILITY_ID VARIABLE (Gerda defined this as site_id_logbook and if that's not available, then facility_baseline, and if that is also not available then finally facility_endline)

# Kick out Kivule as not included in trial (facility 26 and 51 also not included but both have zero obs)
dat <- dat %>%  
  filter(facility_id != 43)

interventionfac = c(3, 6, 7, 9, 10, 13, 15, 16, 17, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42) # I have double checked in form files that this is correct


dat <- mutate(dat, 
              intervfac_num = ifelse(facility_id %in% interventionfac, 1, 0), 
              intervfac = as.factor(intervfac_num))


#  THE FOLLOWING IS THE DEFINITION OF BEING IN AN INTERVENTION FACILITY THAT WAS USED IN THE ORIGINAL REPORT TO 3IE; KEEPING IT HERE AS A ROBUSTNESS CHECK.
# I'm using the logbook site name to define who is in control and intervention. 
# Make everything small letters, remove the words "dispensary" from the facility string (so mwenge = mwenge dispensary), and delete all whitespaces.
dat <- dat %>% 
  mutate(facility_clean = str_to_lower(site_logbook),
         facility_clean = str_replace(facility_clean, "dispensary", ""),
         facility_clean = str_replace_all(facility_clean, fixed(" "), ""))


arthomedel <- c("kimbiji", "mjimwema", "tambukareli", "buza", "arafaugweno", "mbagalarangitatu", "keko", 
                  "makangarawe", "toangoma", "tandale", "mburahati", "mwenge", "mbezi", "hananasif", "kigogo",
                  "mabibo", "goba", "tabata", "vingunguti", "kitunda", "pugu", "tabatanbc", "kinyerezi",
                  "mongolandege")
control <- c("kimara", "kijitonyama", "kichemchem", "bunju", "majimatitu", "magomeni", "mvuti", "makuburi",
             "gerezani", "temeke", "segerea", "kigamboni", "kingugi", "mnazimmoja", "sandali", "chanika",
             "kawe", "kisarawe", "kinondoni", "kibada", "ununio", "roundtable", "majohe", "kiwalani")
                   
dat <- mutate(dat, 
              intervfac_numrobust = ifelse(facility_clean %in% arthomedel, 1, 0),
              intervfacrobust = as.factor(intervfac_numrobust))

```




```{r Create outcome variable}
# No cleaning of vl_endline needed. The only two vl_endline values that I'm wondering might be data entry errors are 99996 and 121212 but it's only one each, so not an issue.

# Cleaning of vl_baseline
oddvls <- c(999, 9999, 99999, 99996, 519999, 999999, 9999968)

dat <- dat %>% 
  mutate(vl_baseline = ifelse(vl_baseline %in% oddvls, NA, vl_baseline))

# Cleaning of cd4_number
oddcd4 <- c(999, 9999, 99999, 999999)

dat <- dat %>% 
  mutate(cd4_number = ifelse(cd4_number_baseline %in% oddcd4, NA, cd4_number_baseline))


# Clean endline vl according to follow-up time
dat <- dat %>% 
  mutate(
    date_baseline_clean = as.Date(ifelse(date_baseline<"2016-02-15" | date_baseline>"2017-03-30", NA, date_baseline)), # sets implausible baseline dates to missing
         
         vl_date_endline_clean = as.Date(ifelse(vl_date_endline<=date_baseline_clean | vl_date_endline>"2018-02-18", NA, vl_date_endline)), # sets vl_date_endline to missing if the endline vl was before the enrolment date
    
    futime = vl_date_endline_clean - date_baseline_clean, # this is the number of days between the endline VL and the baseline interview (no one with a missing vl_endline has a missing date_baseline)
    
    futimegrt60 = ifelse(futime>=60, 1, 0),
    futimegrt90 = ifelse(futime>=90, 1, 0),
    futimegrt120 = ifelse(futime>=120, 1, 0),
    futimegrt150 = ifelse(futime>=150, 1, 0)) %>% 
  dplyr::select(-fu_time, -fu_time_2, -fu_time_3)


# Clean baseline vl/cd4 according to time between baseline and endline measurement
dat <- dat %>% 
  mutate(vl_date_baseline = as.Date(vl_date_baseline),
         vl_date_baseline_clean = as.Date(ifelse(vl_date_baseline<"2010-01-01" | vl_date_baseline>"2017-07-01", NA, vl_date_baseline)), # set implausible vl_date_baselines to missing
         
         cd4_date_baseline = as.Date(cd4_date_baseline), 
         cd4_date_baseline_clean = as.Date(ifelse(cd4_date_baseline<"2010-01-01" | cd4_date_baseline > "2017-07-01", NA, cd4_date_baseline)), # set implausible cd4_date_baselines to missing
         
         cd4_baseline_clean = ifelse(cd4_number_baseline>=9999, NA, cd4_number_baseline), # clean cd4_baseline
         cd4_baseline_clean = ifelse(is.na(cd4_date_baseline_clean)==F & is.na(vl_date_endline_clean)==F & vl_date_endline_clean - cd4_date_baseline_clean<=10, NA, cd4_baseline_clean),  # set cd4_baseline_clean to missing if there has been insufficient time between the endline vl and the baseline cd4
         
         vl_baseline_clean = ifelse(is.na(vl_date_baseline_clean)==F & is.na(vl_date_endline_clean)==F & vl_date_endline_clean - vl_date_baseline_clean<=10, NA, vl_baseline)) # set vl_baseline to missing if there has been insufficient time between the endline vl and the baseline vl

# Create a variable that is the time between the baseline vl/cd4 and the endline vl
dat <- dat %>% 
  mutate(
    date_baselinebiomarker = as_date(
       ifelse(is.na(vl_baseline_clean)==F, vl_date_baseline_clean, 
          ifelse(is.na(cd4_baseline_clean)==F, cd4_date_baseline_clean, NA))),
    timebetweenvls = as.numeric(
       vl_date_endline_clean - date_baselinebiomarker))
           
           
#  viral failure is defined as VL>=1000. At baseline, viral failure is defined as non-suppressed vl or, if vl missing, then cd4<350. 
dat <- dat %>% 
  mutate(vlfail_base = 
           ifelse(is.na(vl_baseline_clean)==T, NA,
               ifelse(vl_baseline_clean >= 1000, 1, 0)),
         
         vlorcd4fail_base_num =  
           ifelse(is.na(vl_baseline_clean)==F, vlfail_base, 
                ifelse(is.na(cd4_baseline_clean)==T, NA,
                    ifelse(cd4_baseline_clean<350, 1, 0))),
         
         vlorcd4fail_base = as.factor(vlorcd4fail_base_num),
         
         vlfail_end_num = 
            ifelse(is.na(vl_endline)==T, NA,
                ifelse(vl_endline >=1000, 1, 0)),
         
         vlfail_end = as.factor(vlfail_end_num))

```



```{r Create vars describing the intervention}
dat <- dat %>% 
  mutate(intervrec_num =  
         ifelse(arvinterventionyesno_update=="yes", 1, 0),
         # set those to control who are marked as having received intervention but don't attend an intervention facility (these are 7 people; 0 people if you use intervfacrobust)
         intervrec_num = ifelse(intervfac=="0" & intervrec_num==1, 0, intervrec_num),

         intervrec = as.factor(intervrec_num),
         
         timeonarthomedel = as.numeric(as.Date(vl_date_endline_clean) - as.Date(dateenrolledintheinterventio)), 
         # Note that for a substantial proportion of intervention recipients, the endline vl occurredd before the enrolment into the intervention (dateenrolledintheinterventio doesn't have totally implausible values)
         
         timeonarthomedelgrt30 = ifelse(timeonarthomedel>=30, 1, 0),
         timeonarthomedelgrt90 = ifelse(timeonarthomedel>=90, 1, 0), 
         timeonarthomedelgrt180 = ifelse(timeonarthomedel>=180, 1, 0),
         timeonarthomedelgrt270 = ifelse(timeonarthomedel>=270, 1, 0),
         
         intervrec_num_onlygrt90 = as.numeric(
           ifelse(is.na(timeonarthomedel)==T, 0, 
                  ifelse(intervrec==1 & timeonarthomedel>=90, 1, 0))), 
         intervrec_num_onlygrt180 = as.numeric(
           ifelse(is.na(timeonarthomedel)==T, 0, 
                  ifelse(intervrec==1 & timeonarthomedel>=180, 1, 0))))

```



```{r Clean socio-demographic variables}

dat <- dat %>% 
  mutate(
    # age
    age_clean = 
      ifelse(str_detect(matches, "Baseline")==TRUE, a2_baseline,
        ifelse(str_detect(matches, "Endline")==TRUE, a2_endline, age_logbook)),
    age_clean = 
      ifelse(is.na(age_clean)==T & is.na(a2_baseline)==F, a2_baseline, 
        ifelse(is.na(age_clean)==T & is.na(a2_baseline)==T & is.na(a2_endline)==F, a2_endline, 
            ifelse(is.na(age_clean)==T & is.na(a2_baseline)==T & is.na(a2_endline)==T & is.na(age_logbook)==F,
                   age_logbook, age_clean))),
    age_clean = ifelse(age_clean>110, NA, age_clean),
    
    age_grp = as.factor(
      ifelse(age_clean<=25, "18-25", 
          ifelse(age_clean>25 & age_clean<=35, "26-35",
              ifelse(age_clean>35 & age_clean<=45, "36-45",
                  ifelse(age_clean>45 & age_clean<=55, "46-55",
                      ifelse(age_clean>55 & age_clean<=65, "56-65", 
                             ifelse(age_clean>65, ">65", NA))))))),
    
    # sex
    sex_logbook = ifelse(as.factor(sex_logbook)=="f", 1, 
                    ifelse(as.factor(sex_logbook)=="m", 2, NA)), 
    male = 
      ifelse(str_detect(matches, "Baseline")==TRUE, a1_baseline-1,
        ifelse(str_detect(matches, "Endline")==TRUE, a1_endline-1, sex_logbook-1)),
    male_num = 
      ifelse(is.na(male)==T & is.na(a1_baseline)==F, a1_baseline-1, 
        ifelse(is.na(male)==T & is.na(a1_baseline)==T & is.na(a1_endline)==F, a1_endline-1, 
                ifelse(is.na(male)==T & is.na(a1_baseline)==T & is.na(a1_endline)==T & is.na(sex_logbook)==F,
                       sex_logbook-1, male))),
    male = as.factor(male_num),

    # married
    married_baseline = ifelse(a3_baseline==1, 1, 
                              ifelse(a3_baseline>1, 0, NA)),
    married_endline = ifelse(a3_endline==1, 1, 
                              ifelse(a3_endline>1, 0, NA)),
    married = as.factor(
      ifelse(str_detect(matches, "Baseline")==TRUE, married_baseline,
        ifelse(str_detect(matches, "Endline")==TRUE, married_endline, NA))),
    married_num = as.numeric(married)-1,
    
    # education
    educ_baseline = 
      ifelse(a8_baseline==1 | a8_baseline==2, "<Primary school",
             ifelse(a8_baseline==3, "Primary school",
                    ifelse(a8_baseline==4 | a8_baseline==5, "Secondary school", NA))),
    educ_endline = 
      ifelse(a8_baseline==1 | a8_baseline==2, "<Primary school",
             ifelse(a8_baseline==3, "Primary school",
                    ifelse(a8_baseline==4 | a8_baseline==5, "Secondary school", NA))),
    educ = as.factor( 
      ifelse(str_detect(matches, "Baseline")==TRUE, educ_baseline,
        ifelse(str_detect(matches, "Endline")==TRUE, educ_endline, NA))),
    
   # informed anyone of HIV status
   hivdisclos_baseline = as.factor(
     ifelse(b1_baseline==1, 1, 
        ifelse(b1_baseline==0, 0, NA))),
   hivdisclos_endline = ifelse(b1_endline==1, 1, 
                                ifelse(b1_endline==0, 0, NA)),
   hivdisclos = 
      ifelse(str_detect(matches, "Baseline")==TRUE, hivdisclos_baseline,
        ifelse(str_detect(matches, "Endline")==TRUE, hivdisclos_endline, NA)),
   
   # taken ART for how long
   artlength_baseline = as.numeric(mdy(today_baseline_baseline)-mdy(b4_baseline)),
   artlength_endline = as.numeric(mdy(today_endline)-mdy(b4_endline)),
   
   artlength_baseline = ifelse(b7_baseline==2 & is.na(b7_baseline)==F, NA, artlength_baseline),
   artlength_endline = ifelse(b7_endline==2 & is.na(b7_baseline)==F, NA, artlength_endline),
   
   artlength_baseline = ifelse(artlength_baseline>10000 | artlength_baseline<0, NA, artlength_baseline),
   artlength_endline = ifelse(artlength_endline>10000 | artlength_endline<0, NA, artlength_endline),

   artlength_baseline_cat = as.factor(
     ifelse(artlength_baseline<90, "<90 days", 
         ifelse(artlength_baseline>=90 & artlength_baseline<180, "90-179 days",
              ifelse(artlength_baseline>=180 & artlength_baseline <365, "180-364 days",
                   ifelse(artlength_baseline>=365 & artlength_baseline<1095, "1 to <3 years",
                        ifelse(artlength_baseline>=1095 & artlength_baseline<1825, "3 to <5 years",
                             ifelse(artlength_baseline>=1825, ">= 5 years", NA))))))))   
   
   
dat$age_grp <- factor(dat$age_grp, levels = c("18-25", "26-35", "36-45", "46-55", "56-65", ">65"))
dat <- within(dat, age_grp <- relevel(age_grp, ref = "18-25"))
dat$artlength_baseline_cat <- factor(dat$artlength_baseline_cat, 
                                     levels = c("<90 days", "90-179 days", "180-364 days", "1 to <3 years", "3 to <5 years", ">= 5 years"))
dat <- within(dat, artlength_baseline_cat <- relevel(artlength_baseline_cat, ref = "<90 days"))

  
```



```{r Create dummy for the matched facility pairs}

# This loads a csv document with the facility_id, facility name, number of ART patients, and number of CHWs
artvol.dat <- read_csv("artvolume_2018-02-09.csv")
artvol.dat <- artvol.dat %>% 
  mutate(facility_id = as.numeric(facility_id), 
         facility_cleanforpairs = str_to_lower(facility),
         facility_cleanforpairs = str_replace_all(facility_cleanforpairs, fixed(" "), ""))
dat <- left_join(dat, artvol.dat, by="facility_id")



dat <- dat %>% 
  mutate(pair = as.factor(
    # Temeke
           ifelse(facility_cleanforpairs=="mbagalarangitatu" | facility_cleanforpairs=="temeke", 1, 
           ifelse(facility_cleanforpairs=="kigamboni" | facility_cleanforpairs=="tambukareli", 2,
           ifelse(facility_cleanforpairs=="makangarawe" | facility_cleanforpairs=="roundtable", 3,
           ifelse(facility_cleanforpairs=="majimatitu" | facility_cleanforpairs=="toangoma", 4,
           ifelse(facility_cleanforpairs=="buza" | facility_cleanforpairs=="kichemchem", 5, 
           ifelse(facility_cleanforpairs=="arafaugweno" | facility_cleanforpairs=="kingugi", 6,
           ifelse(facility_cleanforpairs=="mjimwema" | facility_cleanforpairs=="sandali", 7,
           ifelse(facility_cleanforpairs=="kimbiji" | facility_cleanforpairs=="kibada", 8, 
           ifelse(facility_cleanforpairs=="keko" | facility_cleanforpairs=="kisarawe", 9, 
    
    # Kinondoni       
           ifelse(facility_cleanforpairs=="tandale" | facility_cleanforpairs=="magomeni", 10,
           ifelse(facility_cleanforpairs=="kimara" | facility_cleanforpairs=="mburahati", 11, 
           ifelse(facility_cleanforpairs=="mwenge" | facility_cleanforpairs=="bunju", 12, 
           ifelse(facility_cleanforpairs=="kawe" | facility_cleanforpairs=="mbezi", 13, 
           ifelse(facility_cleanforpairs=="hananasif" | facility_cleanforpairs=="kijitonyama", 14,
           ifelse(facility_cleanforpairs=="kinondoni" | facility_cleanforpairs=="kigogo", 15,
           ifelse(facility_cleanforpairs=="makuburi" | facility_cleanforpairs=="mabibo", 16,
           ifelse(facility_cleanforpairs=="goba" | facility_cleanforpairs=="ununio", 17,
    
    # Ilala
           ifelse(facility_cleanforpairs=="mnazimmoja" | facility_cleanforpairs=="tabata", 18,
           ifelse(facility_cleanforpairs=="vingunguti" | facility_cleanforpairs=="chanika", 19, 
           ifelse(facility_cleanforpairs=="kitunda" | facility_cleanforpairs=="segerea", 20, 
           ifelse(facility_cleanforpairs=="pugu" | facility_cleanforpairs=="kiwalani", 21, 
           ifelse(facility_cleanforpairs=="gerezani" | facility_cleanforpairs=="tabatanbc", 22, 
           ifelse(facility_cleanforpairs=="kinyerezi" | facility_cleanforpairs=="majohe", 23, 
           ifelse(facility_cleanforpairs=="mongolandege" | facility_cleanforpairs=="mvuti", 24, NA))))))))))))))))))))))))))

```


```{r delete those with missing endline vl}
dat <- dat %>% 
  mutate(ltfu = as.factor(ifelse(is.na(vlfail_end)==T, 1, 0)))

datnomiss <- dat %>% 
  filter(is.na(vlfail_end)==F)

```


```{r export to '3ie for analysis' dataset (vl only, endlinecost.dat is for healthcare exp)}

write_dta(dat, "3ieforanalysis_2018-04-30.dta", version=13)

```


```{r Exploration of results}

# First, just by control vs. intervention facility --> slightly worse in control facs
interv.dat <- datnomiss %>% 
  filter(intervfac==1)

control.dat <- datnomiss %>% 
  filter(intervfac==0)

mean(interv.dat$vlfail_end_num) # 9.65
mean(control.dat$vlfail_end_num) # 10.89


# Second, among those who received the intervention 
receivedint.dat <- datnomiss %>% 
  filter(intervfac==1 & intervrec_num==1) # 453 obs

mean(receivedint.dat$vlfail_end_num) # 5.74


# Third, among those who received the intervention by different times on arthomedel --> same vl failure mean regardless of timeonarthomedel
ggplot(data=receivedint.dat) + geom_histogram(aes(x=timeonarthomedel))

timeonarthomedelless0.dat <- receivedint.dat %>% 
  filter(timeonarthomedel<=0)
mean(timeonarthomedelless0.dat$vlfail_end_num) # 1.47
mean(timeonarthomedelless0.dat$futime) # 255

timeonarthomedelgrt30.dat <- receivedint.dat %>% 
  filter(timeonarthomedel>=30)
mean(timeonarthomedelgrt30.dat$vlfail_end_num) # 7.3
mean(timeonarthomedelgrt30.dat$futime) # 322

timeonarthomedelgrt90.dat <- receivedint.dat %>% 
  filter(timeonarthomedel>=90)
mean(timeonarthomedelgrt90.dat$vlfail_end_num) # 7.7
mean(timeonarthomedelgrt90.dat$futime) # 325

timeonarthomedelgrt180.dat <- receivedint.dat %>% 
  filter(timeonarthomedel>=180)
mean(timeonarthomedelgrt180.dat$vlfail_end_num) # 7.1
mean(timeonarthomedelgrt180.dat$futime) # 354

timeonarthomedelgrt270.dat <- receivedint.dat %>% 
  filter(timeonarthomedel>=270)
mean(timeonarthomedelgrt270.dat$vlfail_end_num) # 7.2
mean(timeonarthomedelgrt270.dat$futime) # 407

ggplot(data=receivedint.dat) + geom_smooth(aes(y=vlfail_end_num, x=timeonarthomedel))


# Fourth, among those who are stable --> same between control and interv
stableinterv.dat <- datnomiss %>% 
  filter(intervfac==1 & vlorcd4fail_base_num==0)
mean(stableinterv.dat$vlfail_end_num) # 4.6 

stablecontrol.dat <- datnomiss %>% 
  filter(intervfac==0 & vlorcd4fail_base_num==0)
mean(stablecontrol.dat$vlfail_end_num) # 4.3


# Fifth, by different follow up times --> not much of a pattern
ggplot(data=datnomiss) + geom_histogram(aes(x=futime))
 
intervfutimegrt60.dat <- datnomiss %>% 
  filter(intervfac==1 & futime>=60)
mean(intervfutimegrt60.dat$vlfail_end_num) # 0.1002331

intervfutimegrt120.dat <- datnomiss %>% 
  filter(intervfac==1 & futime>=120)
mean(intervfutimegrt120.dat$vlfail_end_num) # 0.1011905

intervfutimegrt240.dat <- datnomiss %>% 
  filter(intervfac==1 & futime>=240)
mean(intervfutimegrt240.dat$vlfail_end_num) # 0.09869376

intervfutimegrt360.dat <- datnomiss %>% 
  filter(intervfac==1 & futime>=360)
mean(intervfutimegrt360.dat$vlfail_end_num) # 0.1114458

intervfutimegrt440.dat <- datnomiss %>% 
  filter(intervfac==1 & futime>=440)
mean(intervfutimegrt440.dat$vlfail_end_num) # 0.1125

controlfutimegrt60.dat <- datnomiss %>% 
  filter(intervfac==0 & futime>=60)
mean(controlfutimegrt60.dat$vlfail_end_num) # 0.1044957

controlfutimegrt120.dat <- datnomiss %>% 
  filter(intervfac==0 & futime>=120)
mean(controlfutimegrt120.dat$vlfail_end_num) # 0.1015038

controlfutimegrt240.dat <- datnomiss %>% 
  filter(intervfac==0 & futime>=240)
mean(controlfutimegrt240.dat$vlfail_end_num) # 0.1064468

controlfutimegrt360.dat <- datnomiss %>% 
  filter(intervfac==0 & futime>=360)
mean(controlfutimegrt360.dat$vlfail_end_num) # 0.1201299

controlfutimegrt440.dat <- datnomiss %>% 
  filter(intervfac==0 & futime>=440)
mean(controlfutimegrt440.dat$vlfail_end_num) # 0.1090909


# Sixth, by different follow up times among those who're stable --> slight increase with increasing fu_time but the increase is stronger in control arm
 
intervfutimegrt60.dat <- stableinterv.dat %>% 
  filter(intervfac==1 & futime>=60)
mean(intervfutimegrt60.dat$vlfail_end_num) # 0.04991948

intervfutimegrt120.dat <- stableinterv.dat %>% 
  filter(intervfac==1 & futime>=120)
mean(intervfutimegrt120.dat$vlfail_end_num) # 0.04942339

intervfutimegrt240.dat <- stableinterv.dat %>% 
  filter(intervfac==1 & futime>=240)
mean(intervfutimegrt240.dat$vlfail_end_num) # 0.04970179

intervfutimegrt360.dat <- stableinterv.dat %>% 
  filter(intervfac==1 & futime>=360)
mean(intervfutimegrt360.dat$vlfail_end_num) # 0.05381166

intervfutimegrt440.dat <- stableinterv.dat %>% 
  filter(intervfac==1 & futime>=440)
mean(intervfutimegrt440.dat$vlfail_end_num) # 0.06060606

controlfutimegrt60.dat <- stablecontrol.dat %>% 
  filter(intervfac==0 & futime>=60)
mean(controlfutimegrt60.dat$vlfail_end_num) # 0.04166667

controlfutimegrt120.dat <- stablecontrol.dat %>% 
  filter(intervfac==0 & futime>=120)
mean(controlfutimegrt120.dat$vlfail_end_num) # 0.04266212

controlfutimegrt240.dat <- stablecontrol.dat %>% 
  filter(intervfac==0 & futime>=240)
mean(controlfutimegrt240.dat$vlfail_end_num) # 0.05060729

controlfutimegrt360.dat <- stablecontrol.dat %>% 
  filter(intervfac==0 & futime>=360)
mean(controlfutimegrt360.dat$vlfail_end_num) # 0.06849315

controlfutimegrt440.dat <- stablecontrol.dat %>% 
  filter(intervfac==0 & futime>=440)
mean(controlfutimegrt440.dat$vlfail_end_num) # 0.06722689

```




```{r Calculate satisfaction among those receiving intervention}

# Analysis for this statement: 97% of those who received ART community delivery reported to be either “satisfied” or “very satisfied” with the program.
# Notes: There are three different variables to define who has received the intervention: arvinterventionyesno_logbook, arvinterventionyesno, and arvinterventionyesno_update. I'm using arvinterventionyesno_update primarily (and the other two as robustness checks only) .

sat.dat <- dat %>% 
  filter(arvinterventionyesno_update=="yes") %>% 
  mutate(satisfiedwithint = ifelse(is.na(d26z_endline)==T, NA, 
                                ifelse(d26z_endline==1 | d26z_endline==2, 1, 0)))

summary(as.factor(sat.dat$satisfiedwithint)) # yes=310, no=9

library(prevalence)
sat.dat <- sat.dat %>% 
  filter(is.na(satisfiedwithint)==F)
propCI(x=310, n=length(sat.dat$satisfiedwithint), method="all", level=0.95)


```



```{r Table 1, eval = FALSE}
table1names <- c("male", 
                 "age_clean", "age_grp", 
                 "educ", 
                 "married", "married_num",
                 "artlength_baseline", "artlength_baseline_cat",
                 "hivdisclos_baseline", 
                 "intervrec", "intervrec_num",
                 "vlorcd4fail_base", "vlorcd4fail_base_num", "vlfail_end", "vlfail_end_num",
                 "intervfac", "ltfu")



# Table 1 with missings (excluding LTFU)
table1miss <- CreateTableOne(vars = table1names, data=datnomiss, includeNA=T, strata="intervfac")
table1miss

table1miss <- print(table1miss, quote = FALSE, noSpaces = TRUE, printToggle = FALSE)


write.csv(table1miss, "table1miss_2018-02-15.csv")

# Table 1 without missings (excluding LTFU)
table1nomiss <- CreateTableOne(vars = table1names, data=datnomiss, includeNA=F, strata="intervfac")
table1nomiss

table1nomiss <- print(table1nomiss, quote = FALSE, noSpaces = TRUE, printToggle = FALSE)


write.csv(table1nomiss, "table1nomiss_2018-02-15.csv")

# Table 1 comparing LTFU to no LTFU (with missings)
table1ltfumiss <- CreateTableOne(vars = table1names, data=dat, includeNA=T, strata=c("ltfu", "intervfac"))
table1ltfumiss

table1ltfumiss <- print(table1ltfumiss, quote = FALSE, noSpaces = TRUE, printToggle = FALSE)


write.csv(table1ltfumiss, "table1ltfumiss_2018-02-15.csv")

# Table 1 comparing LTFU to no LTFU (without missings)
table1ltfunomiss <- CreateTableOne(vars = table1names, data=dat, includeNA=F, strata=c("ltfu", "intervfac"))
table1ltfunomiss

table1ltfunomiss <- print(table1ltfunomiss, quote = FALSE, noSpaces = TRUE, printToggle = FALSE)


write.csv(table1ltfunomiss, "table1ltfunomiss_2018-02-15.csv")

```


```{r Calculate time taken for an ART care visit, eval=FALSE}


# NOTE: I gave up on calculating time because AM/PM wasn't entered correctly for c120_baseline and c120_endline)

dat <- dat %>% 
  mutate(arrivaltime_baseline = hms(c120_baseline),
         arrivaltime_endline = hms(c120_endline),
         
         interviewstart_baseline = str_sub(starttime_baseline, -11, -1),

         
         
         # baseline
         interviewstart_baseline = str_sub(starttime_baseline, -11, -1),
         interviewstart_baseline = str_split(interviewstart_baseline, "", simplify = TRUE))
         
         
         ,
         interviewstart_baseline = hms(interviewstart_baseline),
         
         
         durationbaselinevisit = as.duration(interviewstart_baseline - arrivaltime_baseline),
         arrivalafterinterviewstart_baseline = ifelse(arrivaltime_baseline>interviewstart_baseline, 1, 0), 
         durationbaselinevisit = as.duration(ifelse(arrivalafterinterviewstart_baseline==1, NA, durationbaselinevisit)),
         
         # endline
         interviewstart_endline = str_sub(starttime_endline, -11, -1),
         interviewstart_endline = hms(interviewstart_endline),
         
         
         durationendlinevisit = as.duration(interviewstart_endline - arrivaltime_endline),
         arrivalafterinterviewstart_endline = ifelse(arrivaltime_endline>interviewstart_endline, 1, 0), 
         durationendlinevisit = as.duration(ifelse(arrivalafterinterviewstart_endline==1, NA, durationendlinevisit)))
         


ggplot(data=dat) + geom_histogram(aes(x=as.numeric(durationbaselinevisit)))




```


```{r Calculate cost of today's visit}
# matches with logbook but without baseline in it
matcheswobaseline <- c("Logbook only", "Logbook x Endline", "Logbook x Endline (new participant)", "Logbook x Lab")
matcheswoendline <- c("Logbook only", "Logbook x Baseline", "Logbook x Baseline x Lab", "Logbook x Endline (new participant)", "Logbook x Lab")

# Kick out all matching types without the logbook in it (as we define sample by logbook)
baselinecost.dat <- dat %>% 
  filter(!(matches %in% matcheswobaseline))

endlinecost.dat <- dat %>% 
  filter(!(matches %in% matcheswoendline))


############## Today's visit - baseline ###################
baselinecost.dat <- mutate(baselinecost.dat, 
              todcons = ifelse(c2_a_baseline<0 | c2_a_baseline==98 | c2_a_baseline==99 | is.na(c2_a_baseline)==T, 
                               0, c2_a_baseline),
              todtests = ifelse(c2_b_baseline<0 | c2_b_baseline==98 | c2_b_baseline==99 | is.na(c2_b_baseline)==T, 
                                0, c2_b_baseline),
              todmeds = ifelse(c2_c_baseline<0 | c2_c_baseline==98 | c2_c_baseline==99  | is.na(c2_c_baseline)==T, 
                               0, c2_c_baseline),
              todtrans = ifelse(c2_d_baseline<0 | c2_d_baseline==98 | c2_d_baseline==99 | is.na(c2_d_baseline)==T, 
                                0, 2*c2_d_baseline), # times two because transport is one-way
              todchild = ifelse(c2_e_baseline<0 | c2_e_baseline==98 | c2_e_baseline==99 | is.na(c2_e_baseline)==T, 
                                0, c2_e_baseline),
              todfood = ifelse(c2_f_baseline<0 | c2_f_baseline==98 | c2_f_baseline==99 | is.na(c2_f_baseline)==T, 
                               0, c2_f_baseline),
              todphone = ifelse(c2_g_baseline<0 | c2_g_baseline==98 | c2_g_baseline==99 | is.na(c2_g_baseline)==T, 
                                0, c2_g_baseline),
              todother = ifelse(c2_h_baseline<0 | c2_h_baseline==98 | c2_h_baseline==99 | is.na(c2_h_baseline)==T, 
                                0, c2_h_baseline),
              todtime = ifelse(c126_baseline<0 | c126_baseline==98 | c126_baseline==99 | is.na(c126_baseline)==T, 
                               0, c126_baseline),
              todcost = todcons + todtests + todmeds + todtrans + todchild +todfood + todphone + todother + todtime)


endlinecost.dat <- mutate(endlinecost.dat, 
              todcons = ifelse(c2_a_endline<0 | c2_a_endline==98 | c2_a_endline==99 | is.na(c2_a_endline)==T, 
                               0, c2_a_endline),
              todtests = ifelse(c2_b_endline<0 | c2_b_endline==98 | c2_b_endline==99 | is.na(c2_b_endline)==T, 
                                0, c2_b_endline),
              todmeds = ifelse(c2_c_endline<0 | c2_c_endline==98 | c2_c_endline==99  | is.na(c2_c_endline)==T, 
                               0, c2_c_endline),
              todtrans = ifelse(c2_d_endline<0 | c2_d_endline==98 | c2_d_endline==99 | is.na(c2_d_endline)==T, 
                                0, 2*c2_d_endline), # times two because transport is one-way
              todchild = ifelse(c2_e_endline<0 | c2_e_endline==98 | c2_e_endline==99 | is.na(c2_e_endline)==T, 
                                0, c2_e_endline),
              todfood = ifelse(c2_f_endline<0 | c2_f_endline==98 | c2_f_endline==99 | is.na(c2_f_endline)==T, 
                               0, c2_f_endline),
              todphone = ifelse(c2_g_endline<0 | c2_g_endline==98 | c2_g_endline==99 | is.na(c2_g_endline)==T, 
                                0, c2_g_endline),
              todother = ifelse(c2_h_endline<0 | c2_h_endline==98 | c2_h_endline==99 | is.na(c2_h_endline)==T, 
                                0, c2_h_endline),
              todtime = ifelse(c126_endline<0 | c126_endline==98 | c126_endline==99 | is.na(c126_endline)==T, 
                               0, c126_endline),
              todcost = todcons + todtests + todmeds + todtrans + todchild +todfood + todphone + todother + todtime)


```



```{r Calculate costs for primary care in last six months}
############## Visits in the last six months ###################

# Clean the visited yes or no variable
endlinecost.dat <- endlinecost.dat %>% 
  mutate(
              pub = ifelse(c6_a_endline==1, 1, 
                           ifelse(c6_a_endline == 99, NA, 0)),
              priv = ifelse(c6_b_endline==1, 1, 
                           ifelse(c6_b_endline == 99, NA, 0)),
              chem = ifelse(c6_c_endline==1, 1, 
                           ifelse(c6_c_endline == 99, NA, 0)),
              trad = ifelse(c6_d_endline==1, 1, 
                           ifelse(c6_d_endline == 99, NA, 0)),
              div = ifelse(c6_e_endline==1, 1, 
                           ifelse(c6_e_endline == 99, NA, 0)),
              faith = ifelse(c6_f_endline==1, 1, 
                           ifelse(c6_f_endline == 99, NA, 0)))

baselinecost.dat <- baselinecost.dat %>% 
  mutate(
              pub = ifelse(c6_a_baseline==1, 1, 
                           ifelse(c6_a_baseline == 99, NA, 0)),
              priv = ifelse(c6_b_baseline==1, 1, 
                           ifelse(c6_b_baseline == 99, NA, 0)),
              chem = ifelse(c6_c_baseline==1, 1, 
                           ifelse(c6_c_baseline == 99, NA, 0)),
              trad = ifelse(c6_d_baseline==1, 1, 
                           ifelse(c6_d_baseline == 99, NA, 0)),
              div = ifelse(c6_e_baseline==1, 1, 
                           ifelse(c6_e_baseline == 99, NA, 0)),
              faith = ifelse(c6_f_baseline==1, 1, 
                           ifelse(c6_f_baseline == 99, NA, 0)))

# Create indicator for visited a public OR private provider during past six months
endlinecost.dat <- endlinecost.dat %>% 
  mutate(visitedpuborpriv = ifelse(pub==1 | priv == 1, 1, 0))

mytable <- xtabs(~visitedpuborpriv + intervfac, data=endlinecost.dat)
ftable(mytable)  

# Clean the number of visits variables (setting to 0 if implausible value)
endlinecost.dat <- endlinecost.dat %>%  
  mutate(
              numpub = ifelse(c6_a1_endline<0 | c6_a1_endline>40 | is.na(c6_a1_endline)==T, 0, c6_a1_endline),
              numpriv = ifelse(c6_b1_endline<0 | c6_b1_endline>40 | is.na(c6_b1_endline)==T, 0, c6_b1_endline),
              numchem = ifelse(c6_c1_endline<0 | c6_c1_endline>40 | is.na(c6_c1_endline)==T, 0, c6_c1_endline),
              numtrad = ifelse(c6_d1_endline<0 | c6_d1_endline>40 | is.na(c6_d1_endline)==T, 0, c6_d1_endline),
              numdiv = ifelse(c6_e1_endline<0 | c6_e1_endline>40 | is.na(c6_e1_endline)==T, 0, c6_e1_endline),
              numfaith = ifelse(c6_f1_endline<0 | c6_f1_endline>40 | is.na(c6_f1_endline)==T, 0, c6_f1_endline))

baselinecost.dat <- baselinecost.dat %>%  
  mutate(
              numpub = ifelse(c6_a1_baseline<0 | c6_a1_baseline>40 | is.na(c6_a1_baseline)==T, 0, c6_a1_baseline),
              numpriv = ifelse(c6_b1_baseline<0 | c6_b1_baseline>40 | is.na(c6_b1_baseline)==T, 0, c6_b1_baseline),
              numchem = ifelse(c6_c1_baseline<0 | c6_c1_baseline>40 | is.na(c6_c1_baseline)==T, 0, c6_c1_baseline),
              numtrad = ifelse(c6_d1_baseline<0 | c6_d1_baseline>40 | is.na(c6_d1_baseline)==T, 0, c6_d1_baseline),
              numdiv = ifelse(c6_e1_baseline<0 | c6_e1_baseline>40 | is.na(c6_e1_baseline)==T, 0, c6_e1_baseline),
              numfaith = ifelse(c6_f1_baseline<0 | c6_f1_baseline>40 | is.na(c6_f1_baseline)==T, 0, c6_f1_baseline))
              
# Clean the cost item variables
# endline
endlinecost.dat <- endlinecost.dat %>% 
  mutate( 
              # public
              pubcons = ifelse(c6_a2_endline<0 | c6_a2_endline==98 | c6_a2_endline==99 | is.na(c6_a2_endline)==T, 
                               0, c6_a2_endline),
              pubtests = ifelse(c6_a3_endline<0 | c6_a3_endline==98 | c6_a3_endline==99 | is.na(c6_a3_endline)==T, 
                                0, c6_a3_endline),
              pubmeds = ifelse(c6_a4_endline<0 | c6_a4_endline==98 | c6_a4_endline==99 | is.na(c6_a4_endline)==T, 
                               0, c6_a4_endline),
              pubtrans = ifelse(c6_a5_endline<0 | c6_a5_endline==98 | c6_a5_endline==99 | is.na(c6_a5_endline)==T, 
                                0, c6_a5_endline), # transport not just one way now
              pubchild = ifelse(c6_a6_endline<0 | c6_a6_endline==98 | c6_a6_endline==99 | is.na(c6_a6_endline)==T, 
                                0, c6_a6_endline),
              pubhouse = ifelse(c6_a7_endline<0 | c6_a7_endline==98 | c6_a7_endline==99 | is.na(c6_a7_endline)==T, 
                                0, c6_a7_endline),
              pubfood = ifelse(c6_a8_endline<0 | c6_a8_endline==98 | c6_a8_endline==99 | is.na(c6_a8_endline)==T, 
                               0, c6_a8_endline),
              pubphone = ifelse(c6_a9_endline<0 | c6_a9_endline==98 | c6_a9_endline==99 | is.na(c6_a9_endline)==T, 
                                0, c6_a9_endline),
              pubaccom = ifelse(c6_a10_endline<0 | c6_a10_endline==98 | c6_a10_endline==99 | is.na(c6_a10_endline)==T, 
                                0, c6_a10_endline),
              pubother = ifelse(c6_a11_endline<0 | c6_a11_endline==98 | c6_a11_endline==99 | is.na(c6_a11_endline)==T, 
                                0, c6_a11_endline),
              pubcost = ifelse(pub==1, numpub* (pubcons + pubtests + pubmeds + pubtrans + pubchild + 
                                                  pubhouse + pubfood + pubphone + pubaccom + pubother), 0),
             
              # private
              privcons = ifelse(c6_b2_endline<0 | c6_b2_endline==98 | c6_b2_endline==99 | is.na(c6_b2_endline)==T, 
                                0, c6_b2_endline),
              privtests = ifelse(c6_b3_endline<0 | c6_b3_endline==98 | c6_b3_endline==99 | is.na(c6_b3_endline)==T, 
                                 0, c6_b3_endline),
              privmeds = ifelse(c6_b4_endline<0 | c6_b4_endline==98 | c6_b4_endline==99 | is.na(c6_b4_endline)==T, 
                                0, c6_b4_endline),
              privtrans = ifelse(c6_b5_endline<0 | c6_b5_endline==98 | c6_b5_endline==99 | is.na(c6_b5_endline)==T, 
                                 0, c6_b5_endline),
              privchild = ifelse(c6_b6_endline<0 | c6_b6_endline==98 | c6_b6_endline==99 | is.na(c6_b6_endline)==T, 
                                 0, c6_b6_endline),
              privhouse = ifelse(c6_b7_endline<0 | c6_b7_endline==98 | c6_b7_endline==99 | is.na(c6_b7_endline)==T, 
                                 0, c6_b7_endline),
              privfood = ifelse(c6_b8_endline<0 | c6_b8_endline==98 | c6_b8_endline==99 | is.na(c6_b8_endline)==T, 
                                0, c6_b8_endline),
              privphone = ifelse(c6_b9_endline<0 | c6_b9_endline==98 | c6_b9_endline==99 | is.na(c6_b9_endline)==T, 
                                 0, c6_b9_endline),
              privaccom = ifelse(c6_b10_endline<0 | c6_b10_endline==98 | c6_b10_endline==99 | is.na(c6_b10_endline)==T, 
                                 0, c6_b10_endline),
              privother = ifelse(c6_b11_endline<0 | c6_b11_endline==98 | c6_b11_endline==99 | is.na(c6_b11_endline)==T, 
                                0, c6_b11_endline),
              privcost = ifelse(priv==1, numpriv* (privcons + privtests + privmeds + privtrans + privchild + 
                                                  privhouse + privfood + privphone + privaccom + privother), 0),
              
              # chemist
              chemcons = ifelse(c6_c2_endline<0 | c6_c2_endline==98 | c6_c2_endline==99 | is.na(c6_c2_endline)==T, 
                                0, c6_c2_endline),
              chemtests = ifelse(c6_c3_endline<0 | c6_c3_endline==98 | c6_c3_endline==99 | is.na(c6_c3_endline)==T, 
                                 0, c6_c3_endline),
              chemmeds = ifelse(c6_c4_endline<0 | c6_c4_endline==98 | c6_c4_endline==99 | is.na(c6_c4_endline)==T, 
                                0, c6_c4_endline),
              chemtrans = ifelse(c6_c5_endline<0 | c6_c5_endline==98 | c6_c5_endline==99 | is.na(c6_c5_endline)==T, 
                                 0, c6_c5_endline),
              chemchild = ifelse(c6_c6_endline<0 | c6_c6_endline==98 | c6_c6_endline==99 | is.na(c6_c6_endline)==T, 
                                 0, c6_c6_endline),
              chemhouse = ifelse(c6_c7_endline<0 | c6_c7_endline==98 | c6_c7_endline==99 | is.na(c6_c7_endline)==T, 
                                 0, c6_c7_endline),
              chemfood = ifelse(c6_c8_endline<0 | c6_c8_endline==98 | c6_c8_endline==99 | is.na(c6_c8_endline)==T, 
                                0, c6_c8_endline),
              chemphone = ifelse(c6_c9_endline<0 | c6_c9_endline==98 | c6_c9_endline==99 | is.na(c6_c9_endline)==T, 
                                 0, c6_c9_endline),
              chemaccom = ifelse(c6_c10_endline<0 | c6_c10_endline==98 | c6_c10_endline==99 | is.na(c6_c10_endline)==T, 
                                 0, c6_c10_endline),
              chemother = ifelse(c6_c11_endline<0 | c6_c11_endline==98 | c6_c11_endline==99 | is.na(c6_c11_endline)==T, 
                                0, c6_c11_endline),
              chemcost = ifelse(chem==1, numchem* (chemcons + chemtests + chemmeds + chemtrans + chemchild + 
                                                  chemhouse + chemfood + chemphone + chemaccom + chemother), 0),
              
              # traditional
              tradcons = ifelse(c6_d2_endline<0 | c6_d2_endline==98 | c6_d2_endline==99 | is.na(c6_d2_endline)==T, 
                                0, c6_d2_endline),
              tradtests = ifelse(c6_d3_endline<0 | c6_d3_endline==98 | c6_d3_endline==99 | is.na(c6_d3_endline)==T, 
                                 0, c6_d3_endline),
              tradmeds = ifelse(c6_d4_endline<0 | c6_d4_endline==98 | c6_d4_endline==99 | is.na(c6_d4_endline)==T, 
                                0, c6_d4_endline),
              tradtrans = ifelse(c6_d5_endline<0 | c6_d5_endline==98 | c6_d5_endline==99 | is.na(c6_d5_endline)==T, 
                                 0, c6_d5_endline),
              tradchild = ifelse(c6_d6_endline<0 | c6_d6_endline==98 | c6_d6_endline==99 | is.na(c6_d6_endline)==T, 
                                 0, c6_d6_endline),
              tradhouse = ifelse(c6_d7_endline<0 | c6_d7_endline==98 | c6_d7_endline==99 | is.na(c6_d7_endline)==T, 
                                 0, c6_d7_endline),
              tradfood = ifelse(c6_d8_endline<0 | c6_d8_endline==98 | c6_d8_endline==99 | is.na(c6_d8_endline)==T, 
                                0, c6_d8_endline),
              tradphone = ifelse(c6_d9_endline<0 | c6_d9_endline==98 | c6_d9_endline==99 | is.na(c6_d9_endline)==T, 
                                 0, c6_d9_endline),
              tradaccom = ifelse(c6_d10_endline<0 | c6_d10_endline==98 | c6_d10_endline==99 | is.na(c6_d10_endline)==T, 
                                 0, c6_d10_endline),
              tradother = ifelse(c6_d11_endline<0 | c6_d11_endline==98 | c6_d11_endline==99 | is.na(c6_d11_endline)==T, 
                                0, c6_d11_endline),
              tradcost = ifelse(trad==1, numtrad* (tradcons + tradtests + tradmeds + tradtrans + tradchild + 
                                                  tradhouse + tradfood + tradphone + tradaccom + tradother), 0),
              
              # diviner
              divcons = ifelse(c6_e2_endline<0 | c6_e2_endline==98 | c6_e2_endline==99 | is.na(c6_e2_endline)==T, 
                               0, c6_e2_endline),
              divtests = ifelse(c6_e3_endline<0 | c6_e3_endline==98 | c6_e3_endline==99 | is.na(c6_e3_endline)==T, 
                                0, c6_e3_endline),
              divmeds = ifelse(c6_e4_endline<0 | c6_e4_endline==98 | c6_e4_endline==99 | is.na(c6_e4_endline)==T, 
                               0, c6_e4_endline),
              divtrans = ifelse(c6_e5_endline<0 | c6_e5_endline==98 | c6_e5_endline==99 | is.na(c6_e5_endline)==T, 
                                0, c6_e5_endline),
              divchild = ifelse(c6_e6_endline<0 | c6_e6_endline==98 | c6_e6_endline==99 | is.na(c6_e6_endline)==T, 
                                0, c6_e6_endline),
              divhouse = ifelse(c6_e7_endline<0 | c6_e7_endline==98 | c6_e7_endline==99 | is.na(c6_e7_endline)==T, 
                                0, c6_e7_endline),
              divfood = ifelse(c6_e8_endline<0 | c6_e8_endline==98 | c6_e8_endline==99 | is.na(c6_e8_endline)==T, 
                               0, c6_e8_endline),
              divphone = ifelse(c6_e9_endline<0 | c6_e9_endline==98 | c6_e9_endline==99 | is.na(c6_e9_endline)==T, 
                                0, c6_e9_endline),
              divaccom = ifelse(c6_e10_endline<0 | c6_e10_endline==98 | c6_e10_endline==99 | is.na(c6_e10_endline)==T, 
                                0, c6_e10_endline),
              divother = ifelse(c6_e11_endline<0 | c6_e11_endline==98 | c6_e11_endline==99 | is.na(c6_e11_endline)==T, 
                                0, c6_e11_endline),
              divcost = ifelse(div==1, numdiv* (divcons + divtests + divmeds + divtrans + divchild + 
                                                  divhouse + divfood + divphone + divaccom + divother), 0),
              
              # faith
              faithcons = ifelse(c6_f2_endline<0 | c6_f2_endline==98 | c6_f2_endline==99 | is.na(c6_f2_endline)==T, 
                                 0, c6_f2_endline),
              faithtests = ifelse(c6_f3_endline<0 | c6_f3_endline==98 | c6_f3_endline==99 | is.na(c6_f3_endline)==T, 
                                  0, c6_f3_endline),
              faithmeds = ifelse(c6_f4_endline<0 | c6_f4_endline==98 | c6_f4_endline==99 | is.na(c6_f4_endline)==T, 
                                 0, c6_f4_endline),
              faithtrans = ifelse(c6_f5_endline<0 | c6_f5_endline==98 | c6_f5_endline==99 | is.na(c6_f5_endline)==T, 
                                  0, c6_f5_endline),
              faithchild = ifelse(c6_f6_endline<0 | c6_f6_endline==98 | c6_f6_endline==99 | is.na(c6_f6_endline)==T, 
                                  0, c6_f6_endline),
              faithhouse = ifelse(c6_f7_endline<0 | c6_f7_endline==98 | c6_f7_endline==99 | is.na(c6_f7_endline)==T, 
                                  0, c6_f7_endline),
              faithfood = ifelse(c6_f8_endline<0 | c6_f8_endline==98 | c6_f8_endline==99 | is.na(c6_f8_endline)==T, 
                                 0, c6_f8_endline),
              faithphone = ifelse(c6_f9_endline<0 | c6_f9_endline==98 | c6_f9_endline==99 | is.na(c6_f9_endline)==T, 
                                  0, c6_f9_endline),
              faithaccom = ifelse(c6_f10_endline<0 | c6_f10_endline==98 | c6_f10_endline==99 | is.na(c6_f10_endline)==T, 
                                  0, c6_f10_endline),
              faithother = ifelse(c6_f11_endline<0 | c6_f11_endline==98 | c6_f11_endline==99 | is.na(c6_f11_endline)==T, 
                                0, c6_f11_endline),
              faithcost = ifelse(faith==1, numfaith* (faithcons + faithtests + faithmeds + faithtrans + faithchild + 
                                                  faithhouse + faithfood + faithphone + faithaccom + faithother), 0),
              
              totcost = pubcost + privcost + chemcost + tradcost + divcost + faithcost)


# baseline
baselinecost.dat <- baselinecost.dat %>% 
  mutate( 
              # public
              pubcons = ifelse(c6_a2_baseline<0 | c6_a2_baseline==98 | c6_a2_baseline==99 | is.na(c6_a2_baseline)==T, 
                               0, c6_a2_baseline),
              pubtests = ifelse(c6_a3_baseline<0 | c6_a3_baseline==98 | c6_a3_baseline==99 | is.na(c6_a3_baseline)==T, 
                                0, c6_a3_baseline),
              pubmeds = ifelse(c6_a4_baseline<0 | c6_a4_baseline==98 | c6_a4_baseline==99 | is.na(c6_a4_baseline)==T, 
                               0, c6_a4_baseline),
              pubtrans = ifelse(c6_a5_baseline<0 | c6_a5_baseline==98 | c6_a5_baseline==99 | is.na(c6_a5_baseline)==T, 
                                0, c6_a5_baseline), # transport not just one way now
              pubchild = ifelse(c6_a6_baseline<0 | c6_a6_baseline==98 | c6_a6_baseline==99 | is.na(c6_a6_baseline)==T, 
                                0, c6_a6_baseline),
              pubhouse = ifelse(c6_a7_baseline<0 | c6_a7_baseline==98 | c6_a7_baseline==99 | is.na(c6_a7_baseline)==T, 
                                0, c6_a7_baseline),
              pubfood = ifelse(c6_a8_baseline<0 | c6_a8_baseline==98 | c6_a8_baseline==99 | is.na(c6_a8_baseline)==T, 
                               0, c6_a8_baseline),
              pubphone = ifelse(c6_a9_baseline<0 | c6_a9_baseline==98 | c6_a9_baseline==99 | is.na(c6_a9_baseline)==T, 
                                0, c6_a9_baseline),
              pubaccom = ifelse(c6_a10_baseline<0 | c6_a10_baseline==98 | c6_a10_baseline==99 | is.na(c6_a10_baseline)==T, 
                                0, c6_a10_baseline),
              pubother = ifelse(c6_a11_baseline<0 | c6_a11_baseline==98 | c6_a11_baseline==99 | is.na(c6_a11_baseline)==T, 
                                0, c6_a11_baseline),
              pubcost = ifelse(pub==1, numpub* (pubcons + pubtests + pubmeds + pubtrans + pubchild + 
                                                  pubhouse + pubfood + pubphone + pubaccom + pubother), 0),
             
              # private
              privcons = ifelse(c6_b2_baseline<0 | c6_b2_baseline==98 | c6_b2_baseline==99 | is.na(c6_b2_baseline)==T, 
                                0, c6_b2_baseline),
              privtests = ifelse(c6_b3_baseline<0 | c6_b3_baseline==98 | c6_b3_baseline==99 | is.na(c6_b3_baseline)==T, 
                                 0, c6_b3_baseline),
              privmeds = ifelse(c6_b4_baseline<0 | c6_b4_baseline==98 | c6_b4_baseline==99 | is.na(c6_b4_baseline)==T, 
                                0, c6_b4_baseline),
              privtrans = ifelse(c6_b5_baseline<0 | c6_b5_baseline==98 | c6_b5_baseline==99 | is.na(c6_b5_baseline)==T, 
                                 0, c6_b5_baseline),
              privchild = ifelse(c6_b6_baseline<0 | c6_b6_baseline==98 | c6_b6_baseline==99 | is.na(c6_b6_baseline)==T, 
                                 0, c6_b6_baseline),
              privhouse = ifelse(c6_b7_baseline<0 | c6_b7_baseline==98 | c6_b7_baseline==99 | is.na(c6_b7_baseline)==T, 
                                 0, c6_b7_baseline),
              privfood = ifelse(c6_b8_baseline<0 | c6_b8_baseline==98 | c6_b8_baseline==99 | is.na(c6_b8_baseline)==T, 
                                0, c6_b8_baseline),
              privphone = ifelse(c6_b9_baseline<0 | c6_b9_baseline==98 | c6_b9_baseline==99 | is.na(c6_b9_baseline)==T, 
                                 0, c6_b9_baseline),
              privaccom = ifelse(c6_b10_baseline<0 | c6_b10_baseline==98 | c6_b10_baseline==99 | is.na(c6_b10_baseline)==T, 
                                 0, c6_b10_baseline),
              privother = ifelse(c6_b11_baseline<0 | c6_b11_baseline==98 | c6_b11_baseline==99 | is.na(c6_b11_baseline)==T, 
                                0, c6_b11_baseline),
              privcost = ifelse(priv==1, numpriv* (privcons + privtests + privmeds + privtrans + privchild + 
                                                  privhouse + privfood + privphone + privaccom + privother), 0),
              
              # chemist
              chemcons = ifelse(c6_c2_baseline<0 | c6_c2_baseline==98 | c6_c2_baseline==99 | is.na(c6_c2_baseline)==T, 
                                0, c6_c2_baseline),
              chemtests = ifelse(c6_c3_baseline<0 | c6_c3_baseline==98 | c6_c3_baseline==99 | is.na(c6_c3_baseline)==T, 
                                 0, c6_c3_baseline),
              chemmeds = ifelse(c6_c4_baseline<0 | c6_c4_baseline==98 | c6_c4_baseline==99 | is.na(c6_c4_baseline)==T, 
                                0, c6_c4_baseline),
              chemtrans = ifelse(c6_c5_baseline<0 | c6_c5_baseline==98 | c6_c5_baseline==99 | is.na(c6_c5_baseline)==T, 
                                 0, c6_c5_baseline),
              chemchild = ifelse(c6_c6_baseline<0 | c6_c6_baseline==98 | c6_c6_baseline==99 | is.na(c6_c6_baseline)==T, 
                                 0, c6_c6_baseline),
              chemhouse = ifelse(c6_c7_baseline<0 | c6_c7_baseline==98 | c6_c7_baseline==99 | is.na(c6_c7_baseline)==T, 
                                 0, c6_c7_baseline),
              chemfood = ifelse(c6_c8_baseline<0 | c6_c8_baseline==98 | c6_c8_baseline==99 | is.na(c6_c8_baseline)==T, 
                                0, c6_c8_baseline),
              chemphone = ifelse(c6_c9_baseline<0 | c6_c9_baseline==98 | c6_c9_baseline==99 | is.na(c6_c9_baseline)==T, 
                                 0, c6_c9_baseline),
              chemaccom = ifelse(c6_c10_baseline<0 | c6_c10_baseline==98 | c6_c10_baseline==99 | is.na(c6_c10_baseline)==T, 
                                 0, c6_c10_baseline),
              chemother = ifelse(c6_c11_baseline<0 | c6_c11_baseline==98 | c6_c11_baseline==99 | is.na(c6_c11_baseline)==T, 
                                0, c6_c11_baseline),
              chemcost = ifelse(chem==1, numchem* (chemcons + chemtests + chemmeds + chemtrans + chemchild + 
                                                  chemhouse + chemfood + chemphone + chemaccom + chemother), 0),
              
              # traditional
              tradcons = ifelse(c6_d2_baseline<0 | c6_d2_baseline==98 | c6_d2_baseline==99 | is.na(c6_d2_baseline)==T, 
                                0, c6_d2_baseline),
              tradtests = ifelse(c6_d3_baseline<0 | c6_d3_baseline==98 | c6_d3_baseline==99 | is.na(c6_d3_baseline)==T, 
                                 0, c6_d3_baseline),
              tradmeds = ifelse(c6_d4_baseline<0 | c6_d4_baseline==98 | c6_d4_baseline==99 | is.na(c6_d4_baseline)==T, 
                                0, c6_d4_baseline),
              tradtrans = ifelse(c6_d5_baseline<0 | c6_d5_baseline==98 | c6_d5_baseline==99 | is.na(c6_d5_baseline)==T, 
                                 0, c6_d5_baseline),
              tradchild = ifelse(c6_d6_baseline<0 | c6_d6_baseline==98 | c6_d6_baseline==99 | is.na(c6_d6_baseline)==T, 
                                 0, c6_d6_baseline),
              tradhouse = ifelse(c6_d7_baseline<0 | c6_d7_baseline==98 | c6_d7_baseline==99 | is.na(c6_d7_baseline)==T, 
                                 0, c6_d7_baseline),
              tradfood = ifelse(c6_d8_baseline<0 | c6_d8_baseline==98 | c6_d8_baseline==99 | is.na(c6_d8_baseline)==T, 
                                0, c6_d8_baseline),
              tradphone = ifelse(c6_d9_baseline<0 | c6_d9_baseline==98 | c6_d9_baseline==99 | is.na(c6_d9_baseline)==T, 
                                 0, c6_d9_baseline),
              tradaccom = ifelse(c6_d10_baseline<0 | c6_d10_baseline==98 | c6_d10_baseline==99 | is.na(c6_d10_baseline)==T, 
                                 0, c6_d10_baseline),
              tradother = ifelse(c6_d11_baseline<0 | c6_d11_baseline==98 | c6_d11_baseline==99 | is.na(c6_d11_baseline)==T, 
                                0, c6_d11_baseline),
              tradcost = ifelse(trad==1, numtrad* (tradcons + tradtests + tradmeds + tradtrans + tradchild + 
                                                  tradhouse + tradfood + tradphone + tradaccom + tradother), 0),
              
              # diviner
              divcons = ifelse(c6_e2_baseline<0 | c6_e2_baseline==98 | c6_e2_baseline==99 | is.na(c6_e2_baseline)==T, 
                               0, c6_e2_baseline),
              divtests = ifelse(c6_e3_baseline<0 | c6_e3_baseline==98 | c6_e3_baseline==99 | is.na(c6_e3_baseline)==T, 
                                0, c6_e3_baseline),
              divmeds = ifelse(c6_e4_baseline<0 | c6_e4_baseline==98 | c6_e4_baseline==99 | is.na(c6_e4_baseline)==T, 
                               0, c6_e4_baseline),
              divtrans = ifelse(c6_e5_baseline<0 | c6_e5_baseline==98 | c6_e5_baseline==99 | is.na(c6_e5_baseline)==T, 
                                0, c6_e5_baseline),
              divchild = ifelse(c6_e6_baseline<0 | c6_e6_baseline==98 | c6_e6_baseline==99 | is.na(c6_e6_baseline)==T, 
                                0, c6_e6_baseline),
              divhouse = ifelse(c6_e7_baseline<0 | c6_e7_baseline==98 | c6_e7_baseline==99 | is.na(c6_e7_baseline)==T, 
                                0, c6_e7_baseline),
              divfood = ifelse(c6_e8_baseline<0 | c6_e8_baseline==98 | c6_e8_baseline==99 | is.na(c6_e8_baseline)==T, 
                               0, c6_e8_baseline),
              divphone = ifelse(c6_e9_baseline<0 | c6_e9_baseline==98 | c6_e9_baseline==99 | is.na(c6_e9_baseline)==T, 
                                0, c6_e9_baseline),
              divaccom = ifelse(c6_e10_baseline<0 | c6_e10_baseline==98 | c6_e10_baseline==99 | is.na(c6_e10_baseline)==T, 
                                0, c6_e10_baseline),
              divother = ifelse(c6_e11_baseline<0 | c6_e11_baseline==98 | c6_e11_baseline==99 | is.na(c6_e11_baseline)==T, 
                                0, c6_e11_baseline),
              divcost = ifelse(div==1, numdiv* (divcons + divtests + divmeds + divtrans + divchild + 
                                                  divhouse + divfood + divphone + divaccom + divother), 0),
              
              # faith
              faithcons = ifelse(c6_f2_baseline<0 | c6_f2_baseline==98 | c6_f2_baseline==99 | is.na(c6_f2_baseline)==T, 
                                 0, c6_f2_baseline),
              faithtests = ifelse(c6_f3_baseline<0 | c6_f3_baseline==98 | c6_f3_baseline==99 | is.na(c6_f3_baseline)==T, 
                                  0, c6_f3_baseline),
              faithmeds = ifelse(c6_f4_baseline<0 | c6_f4_baseline==98 | c6_f4_baseline==99 | is.na(c6_f4_baseline)==T, 
                                 0, c6_f4_baseline),
              faithtrans = ifelse(c6_f5_baseline<0 | c6_f5_baseline==98 | c6_f5_baseline==99 | is.na(c6_f5_baseline)==T, 
                                  0, c6_f5_baseline),
              faithchild = ifelse(c6_f6_baseline<0 | c6_f6_baseline==98 | c6_f6_baseline==99 | is.na(c6_f6_baseline)==T, 
                                  0, c6_f6_baseline),
              faithhouse = ifelse(c6_f7_baseline<0 | c6_f7_baseline==98 | c6_f7_baseline==99 | is.na(c6_f7_baseline)==T, 
                                  0, c6_f7_baseline),
              faithfood = ifelse(c6_f8_baseline<0 | c6_f8_baseline==98 | c6_f8_baseline==99 | is.na(c6_f8_baseline)==T, 
                                 0, c6_f8_baseline),
              faithphone = ifelse(c6_f9_baseline<0 | c6_f9_baseline==98 | c6_f9_baseline==99 | is.na(c6_f9_baseline)==T, 
                                  0, c6_f9_baseline),
              faithaccom = ifelse(c6_f10_baseline<0 | c6_f10_baseline==98 | c6_f10_baseline==99 | is.na(c6_f10_baseline)==T, 
                                  0, c6_f10_baseline),
              faithother = ifelse(c6_f11_baseline<0 | c6_f11_baseline==98 | c6_f11_baseline==99 | is.na(c6_f11_baseline)==T, 
                                0, c6_f11_baseline),
              faithcost = ifelse(faith==1, numfaith* (faithcons + faithtests + faithmeds + faithtrans + faithchild + 
                                                  faithhouse + faithfood + faithphone + faithaccom + faithother), 0),
              
              totcost = pubcost + privcost + chemcost + tradcost + divcost + faithcost)

```


 
```{r export dataset, eval=FALSE}
write_dta(baselinecost.dat, "baselinecostdat_2018-04-30.dta", version=13)
write_dta(endlinecost.dat, "endlinecostdat_2018-04-30.dta", version=13)

```




